R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(corrplot)
## corrplot 0.84 loaded
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(e1071)
library(useful)
## Registered S3 methods overwritten by 'useful':
##   method       from    
##   autoplot.acf forecast
##   fortify.ts   forecast
## 
## Attaching package: 'useful'
## The following object is masked from 'package:xts':
## 
##     reclass
library(xts)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(DiagrammeR)
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(vtreat)
## Loading required package: wrapr
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce

Notes on features by Walmart:

This file contains additional data related to the store, department, and regional activity for the given dates. It contains the following fields:

- Store - the store number
- Date - the week
- Temperature - average temperature in the region
- Fuel_Price - cost of fuel in the region
- MarkDown1-5 - anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA.
- CPI - the consumer price index
- Unemployment - the unemployment rate
- IsHoliday - whether the week is a special holiday week

For convenience, the four holidays fall within the following weeks in the dataset (not all holidays are in the data):

- Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
- Labor Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
- Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
- Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13

# Load the first of the three Kaggle data sets
# ("train.csv", "stores.csv", "features.csv") and inspect its structure
salesdata <- read.csv(file = "train.csv", sep = ",")
str(object = salesdata)
## 'data.frame':    421570 obs. of  5 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Weekly_Sales: num  24924 46039 41596 19404 21828 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
# Load the store table from "stores.csv" and inspect its structure
storesdata <- read.csv(file = "stores.csv", sep = ",")
str(object = storesdata)
## 'data.frame':    45 obs. of  3 variables:
##  $ Store: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
##  $ Size : int  151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
# Load the per-store weekly features from "features.csv" and inspect them
features <- read.csv(file = "features.csv", sep = ",")
str(object = features)
## 'data.frame':    8190 obs. of  12 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Factor w/ 182 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...

Checking data quality: no NAs found in the sales and stores data

sum(is.na(salesdata))
## [1] 0
sum(is.na(storesdata))
## [1] 0

Merging three data sets by common elements

# Attach store attributes (Type, Size) to every sales row.
# `by.salesdata` is not an argument of merge(); it was silently swallowed by
# `...`, so the join only worked through the default key. Name the key
# explicitly so the join cannot change if the tables gain shared columns.
mergedata <- merge(salesdata, storesdata, by = "Store")
# A non-zero count here would indicate missing values in the merged table
sum(is.na(mergedata))
## [1] 0
# Join the features onto the merged sales/store table using every column
# that the sales and features tables have in common
join_keys <- intersect(names(salesdata), names(features))
alldata <- merge(x = mergedata, y = features, by = join_keys)
str(alldata)
## 'data.frame':    421570 obs. of  16 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  1 26 17 45 28 79 55 5 58 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ Type        : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ Fuel_Price  : num  2.57 2.57 2.57 2.57 2.57 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...

Review attributes and modify data type as needed

# Store and Dept are identifiers, not quantities: convert both to factors
for (id_col in c("Store", "Dept")) {
  alldata[[id_col]] <- as.factor(alldata[[id_col]])
}

# Convert Date from its imported type to a proper Date
alldata[["Date"]] <- as.Date(alldata[["Date"]])

# ISO week number of each (weekly) Date within its year
alldata[["Week"]] <- isoweek(alldata[["Date"]])

str(alldata)
## 'data.frame':    421570 obs. of  17 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-05" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ Type        : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ Fuel_Price  : num  2.57 2.57 2.57 2.57 2.57 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  5 5 5 5 5 5 5 5 5 5 ...
summary(alldata)
##      Store             Date            IsHoliday            Dept       
##  13     : 10474   Min.   :2010-02-05   Mode :logical   1      :  6435  
##  10     : 10315   1st Qu.:2010-10-08   FALSE:391909    2      :  6435  
##  4      : 10272   Median :2011-06-17   TRUE :29661     3      :  6435  
##  1      : 10244   Mean   :2011-06-18                   4      :  6435  
##  2      : 10238   3rd Qu.:2012-02-24                   7      :  6435  
##  24     : 10228   Max.   :2012-10-26                   8      :  6435  
##  (Other):359799                                        (Other):382960  
##   Weekly_Sales    Type            Size         Temperature    
##  Min.   : -4989   A:215478   Min.   : 34875   Min.   : -2.06  
##  1st Qu.:  2080   B:163495   1st Qu.: 93638   1st Qu.: 46.68  
##  Median :  7612   C: 42597   Median :140167   Median : 62.09  
##  Mean   : 15981              Mean   :136728   Mean   : 60.09  
##  3rd Qu.: 20206              3rd Qu.:202505   3rd Qu.: 74.28  
##  Max.   :693099              Max.   :219622   Max.   :100.14  
##                                                               
##    Fuel_Price      MarkDown1          MarkDown2          MarkDown3        
##  Min.   :2.472   Min.   :    0.27   Min.   :  -265.8   Min.   :   -29.10  
##  1st Qu.:2.933   1st Qu.: 2240.27   1st Qu.:    41.6   1st Qu.:     5.08  
##  Median :3.452   Median : 5347.45   Median :   192.0   Median :    24.60  
##  Mean   :3.361   Mean   : 7246.42   Mean   :  3334.6   Mean   :  1439.42  
##  3rd Qu.:3.738   3rd Qu.: 9210.90   3rd Qu.:  1926.9   3rd Qu.:   103.99  
##  Max.   :4.468   Max.   :88646.76   Max.   :104519.5   Max.   :141630.61  
##                  NA's   :270889     NA's   :310322     NA's   :284479     
##    MarkDown4          MarkDown5             CPI         Unemployment   
##  Min.   :    0.22   Min.   :   135.2   Min.   :126.1   Min.   : 3.879  
##  1st Qu.:  504.22   1st Qu.:  1878.4   1st Qu.:132.0   1st Qu.: 6.891  
##  Median : 1481.31   Median :  3359.4   Median :182.3   Median : 7.866  
##  Mean   : 3383.17   Mean   :  4629.0   Mean   :171.2   Mean   : 7.960  
##  3rd Qu.: 3595.04   3rd Qu.:  5563.8   3rd Qu.:212.4   3rd Qu.: 8.572  
##  Max.   :67474.85   Max.   :108519.3   Max.   :227.2   Max.   :14.313  
##  NA's   :286603     NA's   :270138                                     
##       Week      
##  Min.   : 1.00  
##  1st Qu.:14.00  
##  Median :26.00  
##  Mean   :25.83  
##  3rd Qu.:38.00  
##  Max.   :52.00  
## 

Counting NAs

# Count NAs per column.
# colSums() on the logical is.na() matrix avoids apply(), which would first
# coerce the whole mixed-type data frame to a character matrix.
colSums(is.na(alldata))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##            0            0            0            0            0 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##            0            0            0            0       270889 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##       310322       284479       286603       270138            0 
## Unemployment         Week 
##            0            0
# Share of NAs per column (as a fraction of rows).
# colMeans() on the logical is.na() matrix avoids apply(), which would first
# coerce the whole mixed-type data frame to a character matrix.
colMeans(is.na(alldata))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##    0.0000000    0.0000000    0.0000000    0.0000000    0.0000000 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##    0.0000000    0.0000000    0.0000000    0.0000000    0.6425718 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##    0.7361103    0.6748085    0.6798468    0.6407904    0.0000000 
## Unemployment         Week 
##    0.0000000    0.0000000
#Results: each of the five "MarkDown" variables is roughly 64% to 74% NA over the full period

Check negative numbers

table(sign(alldata$Weekly_Sales))
## 
##     -1      0      1 
##   1285     73 420212
table(sign(alldata$MarkDown1))
## 
##      1 
## 150681
table(sign(alldata$MarkDown2))
## 
##     -1      0      1 
##   1311    207 109730
table(sign(alldata$MarkDown3))
## 
##     -1      0      1 
##    257     67 136767
table(sign(alldata$MarkDown4))
## 
##      1 
## 134967
table(sign(alldata$MarkDown5))
## 
##      1 
## 151432

Exploring data characteristics

# Histogram of weekly department-store sales on a log10 x-axis.
# The log-scaled per-department, per-store weekly sales appear mound-shaped
# and left skewed.
# log10 is undefined for zero and negative sales, so those rows are removed
# explicitly instead of being discarded by ggplot2 with NaN/infinite warnings.
alldata %>%
  dplyr::filter(Weekly_Sales > 0) %>%
  ggplot() +
  geom_histogram(aes(x = Weekly_Sales), bins = 100) +
  scale_x_continuous(trans = "log10")
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1358 rows containing non-finite values (stat_bin).

# Sum department-level Weekly_Sales (81 distinct departments in this data)
# into one total per store (45 stores) and week, then draw a histogram on a
# log10 x-axis.
# The store-level distribution is not normal, so it is split by Type next.
alldata %>%
  dplyr::select(Weekly_Sales, Store, Date) %>%
  group_by(Store, Date) %>%
  summarise(WKlyStoreSales = sum(Weekly_Sales)) %>%
  ggplot(mapping = aes(x = WKlyStoreSales)) +
  scale_x_continuous(trans = "log10") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same store-week totals, one histogram panel per store Type (log10 x-axis).
# Within each Type the distribution looks mound-shaped.
alldata %>%
  dplyr::select(Weekly_Sales, Store, Date, Type) %>%
  group_by(Store, Date, Type) %>%
  summarise(WKlyStoreSales = sum(Weekly_Sales)) %>%
  ggplot(mapping = aes(x = WKlyStoreSales)) +
  scale_x_continuous(trans = "log10") +
  facet_grid(Type ~ .) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Divide each weekly sales figure by store size, total it per store and week,
# and plot the histogram.
# `binwidth` used to be passed to ggplot(), which ignores it, so the chart was
# always drawn with the default 30 bins. The bin count is now set on the geom,
# where it takes effect, and keeps the chart that was actually being rendered.
alldata %>%
  mutate(WKlySizeSales = Weekly_Sales / Size) %>%
  group_by(Date, Store) %>%
  summarize(PerStoreSizeSales = sum(WKlySizeSales)) %>%
  ggplot(aes(x = PerStoreSizeSales)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of store sizes within each store Type
# (one row per distinct Store/Size/Type combination)
alldata %>%
  dplyr::select(Store, Size, Type) %>%
  distinct() %>%
  ggplot(mapping = aes(x = Size)) +
  facet_grid(Type ~ .) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Seasonality by store types: sales spike for Thanksgiving and Christmas holidays

# Total weekly sales per store Type (A, B, C) across the 143-week period,
# on a log10 y-axis; the series show strong seasonality
alldata %>%
  group_by(Type, Date) %>%
  summarize(WeeklySales = sum(Weekly_Sales)) %>%
  ggplot(aes(x = Date, y = WeeklySales)) +
  geom_line(aes(color = factor(Type), group = Type)) +
  scale_y_log10() +
  scale_x_date(date_breaks = "4 weeks") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mean Weekly_Sales per date across the 143-week period, with a smoothed
# trend line, to check whether the seasonality persists in the average
alldata %>%
  group_by(Date) %>%
  summarize(Mean_WeeklySales = mean(Weekly_Sales)) %>%
  ggplot(aes(x = Date, y = Mean_WeeklySales)) +
  geom_line() +
  geom_smooth() +
  scale_x_date(date_breaks = "4 weeks") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Checking IsHoliday against holiday calendars

# IsHoliday is flagged in the same weeks of the year
ggplot(alldata, aes(x = IsHoliday, y = Week)) + geom_point()

# List the distinct holiday weeks.
# IsHoliday is already logical, so it is used directly as the filter condition
# rather than being compared with the string "TRUE" (which only worked through
# implicit coercion).
alldata %>%
  dplyr::select(Date, IsHoliday, Week) %>%
  dplyr::filter(IsHoliday) %>%
  distinct()
##          Date IsHoliday Week
## 1  2010-02-12      TRUE    6
## 2  2010-09-10      TRUE   36
## 3  2010-11-26      TRUE   47
## 4  2010-12-31      TRUE   52
## 5  2011-02-11      TRUE    6
## 6  2011-09-09      TRUE   36
## 7  2011-11-25      TRUE   47
## 8  2011-12-30      TRUE   52
## 9  2012-02-10      TRUE    6
## 10 2012-09-07      TRUE   36

Weekly sales correlations with CPI, Fuel Price, Temperature, Unemployment

# corrplot shows no correlations
# NOTE(review): attach() is kept only for backward compatibility; later chunks
# may resolve columns through the search path (e.g. a filter on Date after
# Date has been dropped by select). Remove it once nothing relies on it.
attach(alldata)
# Take the columns from alldata by name instead of from the attached copies,
# so the correlation input always reflects the current data frame.
M <- alldata[, c("Weekly_Sales", "Fuel_Price", "Temperature", "CPI", "Unemployment")]
MCor <- cor(M)
corrplot(MCor, order = "AOE")

Markdown feature

# Split the data at 2012-02-03: MKD holds rows on or after that date,
# MKD2 holds the rows before it
markdown_cutoff <- as.Date("2012-02-03")
MKD <- alldata %>% filter(Date >= markdown_cutoff)
MKD2 <- alldata %>% filter(Date < markdown_cutoff)

Check NAs in MarkDowns after 2012-02-03 (temporal split for test set)

# Count NAs per column on or after 2012-02-03.
# colSums(is.na(.)) avoids apply(), which would coerce the mixed-type data
# frame to a character matrix first.
colSums(is.na(MKD))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##            0            0            0            0            0 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##            0            0            0            0          457 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##        36292        11813        12050            0            0 
## Unemployment         Week 
##            0            0
# Share of NAs per column (fraction of rows) on or after 2012-02-03.
# colMeans(is.na(.)) avoids apply(), which would coerce the mixed-type data
# frame to a character matrix first.
colMeans(is.na(MKD))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.003953698 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##  0.313977229  0.102199190  0.104249576  0.000000000  0.000000000 
## Unemployment         Week 
##  0.000000000  0.000000000

Check NAs in MarkDowns before 2012-02-03

# MarkDown features are more than 88% NA before 2012-02-03.
# Count NAs per column; colSums(is.na(.)) avoids apply(), which would coerce
# the mixed-type data frame to a character matrix first.
colSums(is.na(MKD2))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##            0            0            0            0            0 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##            0            0            0            0       270432 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##       274030       272666       274553       270138            0 
## Unemployment         Week 
##            0            0
# Share of NAs per column (fraction of rows) before 2012-02-03;
# colMeans(is.na(.)) avoids coercing the data frame through apply()
colMeans(is.na(MKD2))
##        Store         Date    IsHoliday         Dept Weekly_Sales 
##    0.0000000    0.0000000    0.0000000    0.0000000    0.0000000 
##         Type         Size  Temperature   Fuel_Price    MarkDown1 
##    0.0000000    0.0000000    0.0000000    0.0000000    0.8838167 
##    MarkDown2    MarkDown3    MarkDown4    MarkDown5          CPI 
##    0.8955756    0.8911178    0.8972848    0.8828559    0.0000000 
## Unemployment         Week 
##    0.0000000    0.0000000

Boxplot charts of MarkDown data after 2012-02-03 (because there are fewer missing values for this period)

# Subset the five MarkDown features by name.
# cbind() of `MKD$...` expressions produced a matrix without column names, so
# the boxes were unlabelled; selecting by name keeps a numeric matrix
# but carries the column names through to the plot.
MKDdata <- as.matrix(MKD[, paste0("MarkDown", 1:5)])

# boxplot: one box per MarkDown column
boxplot(MKDdata)

# Scatterplot of MarkDown1 against Date for the rows in MKD
# (dates on or after 2012-02-03), with x-axis breaks every 4 weeks
MKD %>%
  ggplot(aes(x = Date, y = MarkDown1)) +
  geom_point() +
  scale_x_date(date_breaks = "4 weeks") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 457 rows containing missing values (geom_point).

MKD %>% ggplot(aes(x = Date, y = MarkDown2))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 36292 rows containing missing values (geom_point).

MKD %>% ggplot(aes(x = Date, y = MarkDown3))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 11813 rows containing missing values (geom_point).

MKD %>% ggplot(aes(x = Date, y = MarkDown4))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 12050 rows containing missing values (geom_point).

MKD %>% ggplot(aes(x = Date, y = MarkDown5))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))

Correlation between MarkDowns and Weekly Sales

# Weekly sales and MarkDown columns for dates on or after 2012-02-03.
# The date filter must run BEFORE select(): Date is not among the selected
# columns, so filtering afterwards only worked when a same-length `Date`
# vector happened to be visible outside the data (e.g. via attach()).
SM <- alldata %>%
  dplyr::filter(Date >= as.Date("2012-02-03")) %>%
  dplyr::select(Weekly_Sales, MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5)
ggplot(SM, aes(x=MarkDown1, y = Weekly_Sales)) + geom_point()
## Warning: Removed 457 rows containing missing values (geom_point).

ggplot(SM, aes(x = MarkDown2, y = Weekly_Sales)) + geom_point()
## Warning: Removed 36292 rows containing missing values (geom_point).

ggplot(SM, aes(x = MarkDown3, y = Weekly_Sales)) + geom_point()
## Warning: Removed 11813 rows containing missing values (geom_point).

ggplot(SM, aes(x= MarkDown4, y = Weekly_Sales))+geom_point()
## Warning: Removed 12050 rows containing missing values (geom_point).

ggplot(SM, aes(x=MarkDown5, y = Weekly_Sales)) + geom_point()

Drop MarkDown features and external features because of the large share of NAs in the MarkDown columns and no strong correlation with Weekly Sales

# Drop the five MarkDown columns and the four external features
data_clean <- alldata %>%
  dplyr::select(
    -starts_with("MarkDown"),
    -CPI, -Temperature, -Fuel_Price, -Unemployment
  )

# Remaining features: Weekly_Sales (numeric), Store (factor), Dept (factor),
# Date, IsHoliday (logical), Week (numeric), Type (factor), Size (integer)
str(data_clean)
## 'data.frame':    421570 obs. of  8 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-05" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ Type        : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Week        : num  5 5 5 5 5 5 5 5 5 5 ...

Stepwise feature selection

#feature selection
#nullModel<-lm(Weekly_Sales~1, data_clean)
#fullModel<-lm(Weekly_Sales~Dept+Store+IsHoliday+Week+Size+Type, data = data_clean)
#SalesStep<-step(nullModel, scope = (list(lower=nullModel, upper = fullModel)), direction = "both")

Prepare data set for model building

#checking department-store level
# 3331 department-store levels
#data_clean$DeptID<-as.factor(paste("D", data_ts$Dept, "S", data_ts$Store, sep=""))
#levels(data_clean[,"DeptID"])

#drop DeptID variable due to computation inefficiency
#data_clean<-subset(data_clean, select = -c(DeptID))
str(data_clean)
## 'data.frame':    421570 obs. of  8 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-05" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ Type        : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Week        : num  5 5 5 5 5 5 5 5 5 5 ...
# Aggregate department sales to one total per store and week
# (45 stores x 143 weeks).
# select() is namespace-qualified like the other calls in this file, and the
# result is ungrouped so later filter/reshape steps do not silently inherit
# the leftover grouping by Date that summarise() leaves behind.
data_agg_store <- data_clean %>%
  dplyr::select(Weekly_Sales, Date, Store) %>%
  group_by(Date, Store) %>%
  summarise(Store_Weekly_Sales = sum(Weekly_Sales)) %>%
  ungroup()
summary(data_agg_store)
##       Date                Store      Store_Weekly_Sales
##  Min.   :2010-02-05   1      : 143   Min.   : 209986   
##  1st Qu.:2010-10-08   2      : 143   1st Qu.: 553350   
##  Median :2011-06-17   3      : 143   Median : 960746   
##  Mean   :2011-06-17   4      : 143   Mean   :1046965   
##  3rd Qu.:2012-02-24   5      : 143   3rd Qu.:1420159   
##  Max.   :2012-10-26   6      : 143   Max.   :3818686   
##                       (Other):5577

Split train/test sets by temporal point “2012-02-03”, about 73/27 split by week numbers

# Temporal split: weeks before the cutoff go to train, the rest to test
split_date <- as.Date("2012-02-03")
train <- data_agg_store %>% filter(Date < split_date)
test <- data_agg_store %>% filter(Date >= split_date)
summary(train)
##       Date                Store      Store_Weekly_Sales
##  Min.   :2010-02-05   1      : 104   Min.   : 209986   
##  1st Qu.:2010-08-04   2      : 104   1st Qu.: 549523   
##  Median :2011-01-31   3      : 104   Median : 956062   
##  Mean   :2011-01-31   4      : 104   Mean   :1048286   
##  3rd Qu.:2011-07-30   5      : 104   3rd Qu.:1415272   
##  Max.   :2012-01-27   6      : 104   Max.   :3818686   
##                       (Other):4056
summary(test)
##       Date                Store      Store_Weekly_Sales
##  Min.   :2012-02-03   1      :  39   Min.   : 237130   
##  1st Qu.:2012-04-06   2      :  39   1st Qu.: 561700   
##  Median :2012-06-15   3      :  39   Median : 968897   
##  Mean   :2012-06-15   4      :  39   Mean   :1043441   
##  3rd Qu.:2012-08-24   5      :  39   3rd Qu.:1439366   
##  Max.   :2012-10-26   6      :  39   Max.   :2565260   
##                       (Other):1521
# Create the time indexes for the xts series.
# The weekly dates are taken from the train/test sets themselves rather than
# generated from a hard-coded start date and length (104 / 39), so each index
# always has one entry per distinct week actually present in the data.
train_Date_index <- sort(unique(train$Date))
test_Date_index <- sort(unique(test$Date))
train_Date_index
##   [1] "2010-02-05" "2010-02-12" "2010-02-19" "2010-02-26" "2010-03-05"
##   [6] "2010-03-12" "2010-03-19" "2010-03-26" "2010-04-02" "2010-04-09"
##  [11] "2010-04-16" "2010-04-23" "2010-04-30" "2010-05-07" "2010-05-14"
##  [16] "2010-05-21" "2010-05-28" "2010-06-04" "2010-06-11" "2010-06-18"
##  [21] "2010-06-25" "2010-07-02" "2010-07-09" "2010-07-16" "2010-07-23"
##  [26] "2010-07-30" "2010-08-06" "2010-08-13" "2010-08-20" "2010-08-27"
##  [31] "2010-09-03" "2010-09-10" "2010-09-17" "2010-09-24" "2010-10-01"
##  [36] "2010-10-08" "2010-10-15" "2010-10-22" "2010-10-29" "2010-11-05"
##  [41] "2010-11-12" "2010-11-19" "2010-11-26" "2010-12-03" "2010-12-10"
##  [46] "2010-12-17" "2010-12-24" "2010-12-31" "2011-01-07" "2011-01-14"
##  [51] "2011-01-21" "2011-01-28" "2011-02-04" "2011-02-11" "2011-02-18"
##  [56] "2011-02-25" "2011-03-04" "2011-03-11" "2011-03-18" "2011-03-25"
##  [61] "2011-04-01" "2011-04-08" "2011-04-15" "2011-04-22" "2011-04-29"
##  [66] "2011-05-06" "2011-05-13" "2011-05-20" "2011-05-27" "2011-06-03"
##  [71] "2011-06-10" "2011-06-17" "2011-06-24" "2011-07-01" "2011-07-08"
##  [76] "2011-07-15" "2011-07-22" "2011-07-29" "2011-08-05" "2011-08-12"
##  [81] "2011-08-19" "2011-08-26" "2011-09-02" "2011-09-09" "2011-09-16"
##  [86] "2011-09-23" "2011-09-30" "2011-10-07" "2011-10-14" "2011-10-21"
##  [91] "2011-10-28" "2011-11-04" "2011-11-11" "2011-11-18" "2011-11-25"
##  [96] "2011-12-02" "2011-12-09" "2011-12-16" "2011-12-23" "2011-12-30"
## [101] "2012-01-06" "2012-01-13" "2012-01-20" "2012-01-27"
test_Date_index
##  [1] "2012-02-03" "2012-02-10" "2012-02-17" "2012-02-24" "2012-03-02"
##  [6] "2012-03-09" "2012-03-16" "2012-03-23" "2012-03-30" "2012-04-06"
## [11] "2012-04-13" "2012-04-20" "2012-04-27" "2012-05-04" "2012-05-11"
## [16] "2012-05-18" "2012-05-25" "2012-06-01" "2012-06-08" "2012-06-15"
## [21] "2012-06-22" "2012-06-29" "2012-07-06" "2012-07-13" "2012-07-20"
## [26] "2012-07-27" "2012-08-03" "2012-08-10" "2012-08-17" "2012-08-24"
## [31] "2012-08-31" "2012-09-07" "2012-09-14" "2012-09-21" "2012-09-28"
## [36] "2012-10-05" "2012-10-12" "2012-10-19" "2012-10-26"
# Reshape train to wide form: one row per week, one column per store,
# then drop the Date column so only the sales values remain
trainwide <- train %>%
  dcast(formula = Date ~ Store, fun.aggregate = sum, value.var = "Store_Weekly_Sales") %>%
  dplyr::select(-Date)

# Wrap the wide table in an xts object indexed by the weekly train dates
train_xts <- xts(x = trainwide, order.by = train_Date_index)
head(train_xts)
##                  1       2        3       4        5       6        7
## 2010-02-05 1643691 2136989 461622.2 2135144 317173.1 1652635 496725.4
## 2010-02-12 1641957 2137810 420729.0 2188307 311825.7 1606284 524104.9
## 2010-02-19 1611968 2124452 421642.2 2049860 303447.6 1567138 506760.5
## 2010-02-26 1409728 1865097 407204.9 1925729 270281.6 1432953 496083.2
## 2010-03-05 1554807 1991013 415202.0 1971057 288855.7 1601349 491419.5
## 2010-03-12 1439542 1990484 384200.7 1894324 297293.6 1558621 480452.1
##                    8        9      10      11        12      13      14
## 2010-02-05 1004137.1 549505.6 2193049 1528009 1100046.4 1967221 2623470
## 2010-02-12  994801.4 552677.5 2176029 1574684 1117863.3 2030933 1704219
## 2010-02-19  963960.4 511327.9 2113433 1503299 1095421.6 1970275 2204557
## 2010-02-26  847592.1 473773.3 2006775 1336405 1048617.2 1817850 2095592
## 2010-03-05  881503.9 507297.9 1987090 1426623 1077018.3 1939980 2237545
## 2010-03-12  860336.2 494145.8 1941346 1331883  985594.2 1840687 2156035
##                  15       16       17      18      19      20       21
## 2010-02-05 652122.4 477409.3 789036.0 1205308 1507637 2401395 798593.9
## 2010-02-12 682447.1 472044.3 841951.9 1187881 1536550 2109108 809321.4
## 2010-02-19 660838.8 469868.7 800714.0 1150663 1515976 2161550 867283.2
## 2010-02-26 564883.2 443242.2 749549.6 1068157 1373270 1898194 749597.2
## 2010-03-05 605325.4 444181.8 783300.1 1179738 1495845 2119214 747444.3
## 2010-03-12 604173.6 445393.7 763961.8 1138800 1467889 2010975 712312.9
##                   22      23      24       25        26      27      28
## 2010-02-05 1033017.4 1364722 1388726 677231.6 1034119.2 1874290 1672352
## 2010-02-12 1022571.2 1380892 1414107 583364.0 1015684.1 1745363 1558968
## 2010-02-19  988467.6 1319588 1385362 676260.7  999348.6 1945070 1491300
## 2010-02-26  899761.5 1198710 1158723 628516.6  855385.0 1390934 1542173
## 2010-03-05 1009201.2 1311176 1412387 665750.1 1005669.6 1313730 1608435
## 2010-03-12  967187.4 1408083 1309340 660620.0  963382.1 1925113 1326877
##                  29       30      31      32       33       34      35
## 2010-02-05 538634.5 465108.5 1469252 1087616 274593.4 956229.0 1230614
## 2010-02-12 529672.9 497374.6 1543947 1123566 294882.8 994611.0 1168815
## 2010-02-19 542399.1 463513.3 1473387 1082559 296850.8 983963.1 1270659
## 2010-02-26 488417.6 472330.7 1344354 1053247 284052.8 905756.1 1020652
## 2010-03-05 535087.9 472591.1 1384871 1066567 291484.9 918295.8 1162610
## 2010-03-12 519042.5 468189.9 1366193 1093319 312161.0 921247.9 1150344
##                  36       37       38      39        40        41       42
## 2010-02-05 467546.7 536006.7 358496.1 1230597 1001943.8 1086533.2 543384.0
## 2010-02-12 469563.7 529852.7 342214.9 1266229  955338.3 1075656.3 575710.0
## 2010-02-19 470281.0 510382.5 327237.9 1230592  916289.2 1052034.7 508794.9
## 2010-02-26 447519.4 513615.8 334222.7 1168582  863917.4  991941.7 491510.6
## 2010-03-05 480203.4 519255.7 372239.9 1266254  990152.3 1063557.5 554972.4
## 2010-03-12 441434.2 513015.3 342023.9 1244392  899352.4 1023997.7 588363.6
##                  43       44       45
## 2010-02-05 647029.3 281091.0 890689.5
## 2010-02-12 682919.0 286857.1 656988.6
## 2010-02-19 658997.6 267956.3 841264.0
## 2010-02-26 618702.8 273079.1 741891.7
## 2010-03-05 658600.1 284617.3 777951.2
## 2010-03-12 645386.9 272190.8 765687.4
# Reshape test to wide form: one row per week, one column per store,
# then drop the Date column so only the sales values remain
testwide <- test %>%
  dcast(formula = Date ~ Store, fun.aggregate = sum, value.var = "Store_Weekly_Sales") %>%
  dplyr::select(-Date)
# Wrap the wide table in an xts object indexed by the weekly test dates
test_xts <- xts(x = testwide, order.by = test_Date_index)

Build seasonal auto.arima model

# Checking ACF and PACF.
# Apply acf() to every column of train_xts; the result is a list with one
# autocorrelation object per column, printed below under the column's name
# (`1`, `2`, ...).
# The lambda's argument is deliberately left as `x`: the printed output labels
# each series 'x' after it, so renaming the argument would change the output.
lapply(train_xts, function(x) acf(x))

## $`1`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.299  0.164  0.082  0.199 -0.189 -0.114 -0.029 -0.072 -0.077 
##     70     77     84     91     98    105    112    119    126    133 
## -0.013 -0.021 -0.029  0.003 -0.004 -0.017  0.030  0.014 -0.004 -0.041 
##    140 
## -0.018 
## 
## $`2`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.393  0.259  0.105  0.191 -0.145 -0.114 -0.024 -0.053 -0.100 
##     70     77     84     91     98    105    112    119    126    133 
## -0.061 -0.037 -0.059 -0.097 -0.076 -0.053  0.000 -0.020 -0.027 -0.061 
##    140 
## -0.057 
## 
## $`3`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.522  0.377  0.381  0.305 -0.016 -0.004  0.064 -0.034 -0.009 
##     70     77     84     91     98    105    112    119    126    133 
##  0.008  0.012 -0.038 -0.005 -0.073 -0.132 -0.096 -0.095 -0.140 -0.134 
##    140 
## -0.115 
## 
## $`4`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.415  0.291  0.210  0.345  0.029  0.041  0.096  0.077  0.063 
##     70     77     84     91     98    105    112    119    126    133 
##  0.047  0.065  0.025  0.014  0.028  0.040  0.024  0.035  0.037  0.022 
##    140 
## -0.029 
## 
## $`5`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.367  0.277  0.228  0.287 -0.084 -0.119 -0.085 -0.080 -0.062 
##     70     77     84     91     98    105    112    119    126    133 
## -0.077 -0.015  0.017 -0.029 -0.055 -0.047  0.003 -0.065 -0.065 -0.108 
##    140 
## -0.071 
## 
## $`6`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.455  0.281  0.139  0.166 -0.169 -0.183 -0.183 -0.218 -0.235 
##     70     77     84     91     98    105    112    119    126    133 
## -0.198 -0.151 -0.135 -0.164 -0.133 -0.060 -0.003 -0.013  0.009  0.002 
##    140 
##  0.033 
## 
## $`7`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.638  0.514  0.368  0.391  0.170  0.053 -0.048 -0.130 -0.146 
##     70     77     84     91     98    105    112    119    126    133 
## -0.165 -0.143 -0.158 -0.134 -0.118 -0.056 -0.017 -0.001  0.037  0.064 
##    140 
##  0.138 
## 
## $`8`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.367  0.251  0.133  0.189 -0.191 -0.086 -0.017  0.001 -0.023 
##     70     77     84     91     98    105    112    119    126    133 
## -0.009 -0.018 -0.050 -0.069 -0.059 -0.025  0.035  0.011 -0.046 -0.050 
##    140 
## -0.095 
## 
## $`9`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.428  0.297  0.206  0.253 -0.113 -0.054 -0.019  0.000 -0.057 
##     70     77     84     91     98    105    112    119    126    133 
## -0.042 -0.013 -0.017 -0.077 -0.060 -0.019  0.024 -0.011 -0.026 -0.032 
##    140 
## -0.028 
## 
## $`10`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.456  0.310  0.181  0.212 -0.125 -0.095 -0.063 -0.093 -0.101 
##     70     77     84     91     98    105    112    119    126    133 
## -0.074 -0.055 -0.085 -0.101 -0.056 -0.038 -0.028 -0.005  0.040  0.007 
##    140 
## -0.076 
## 
## $`11`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.407  0.223  0.079  0.118 -0.233 -0.134 -0.031 -0.056 -0.081 
##     70     77     84     91     98    105    112    119    126    133 
## -0.041 -0.039 -0.081 -0.101 -0.060 -0.041  0.029  0.058  0.106  0.020 
##    140 
## -0.041 
## 
## $`12`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.364  0.243  0.117  0.199 -0.198 -0.120 -0.102 -0.101 -0.118 
##     70     77     84     91     98    105    112    119    126    133 
## -0.046 -0.053 -0.002 -0.049 -0.039 -0.021  0.081  0.004 -0.037 -0.079 
##    140 
## -0.101 
## 
## $`13`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.399  0.262  0.090  0.189 -0.166 -0.103 -0.083 -0.073 -0.110 
##     70     77     84     91     98    105    112    119    126    133 
## -0.061 -0.079 -0.115 -0.125 -0.079 -0.070  0.013  0.047  0.058  0.012 
##    140 
##  0.020 
## 
## $`14`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.264  0.262  0.042  0.143 -0.236 -0.102 -0.096 -0.067 -0.098 
##     70     77     84     91     98    105    112    119    126    133 
## -0.059 -0.014 -0.135 -0.036 -0.062  0.029 -0.034  0.061 -0.015 -0.004 
##    140 
##  0.003 
## 
## $`15`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.408  0.234  0.113  0.157 -0.225 -0.194 -0.166 -0.132 -0.158 
##     70     77     84     91     98    105    112    119    126    133 
## -0.134 -0.111 -0.108 -0.132 -0.095 -0.038 -0.016 -0.020 -0.058 -0.029 
##    140 
##  0.003 
## 
## $`16`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.597  0.430  0.259  0.269  0.050 -0.008 -0.068 -0.065 -0.067 
##     70     77     84     91     98    105    112    119    126    133 
## -0.129 -0.166 -0.163 -0.137 -0.122 -0.098 -0.079 -0.122 -0.143 -0.091 
##    140 
## -0.031 
## 
## $`17`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.148  0.321  0.161  0.156 -0.118  0.106 -0.003  0.026 -0.028 
##     70     77     84     91     98    105    112    119    126    133 
##  0.095  0.095  0.004  0.022  0.062  0.143 -0.180  0.092 -0.099 -0.105 
##    140 
##  0.033 
## 
## $`18`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.435  0.337  0.161  0.189 -0.156 -0.068 -0.026 -0.028 -0.067 
##     70     77     84     91     98    105    112    119    126    133 
## -0.048 -0.050 -0.165 -0.094 -0.120 -0.053 -0.027  0.081  0.003  0.003 
##    140 
##  0.010 
## 
## $`19`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.353  0.180 -0.004  0.066 -0.215 -0.142 -0.110 -0.078 -0.097 
##     70     77     84     91     98    105    112    119    126    133 
## -0.065 -0.008 -0.048 -0.060 -0.018  0.076  0.066  0.025 -0.089 -0.081 
##    140 
## -0.045 
## 
## $`20`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.396  0.279  0.126  0.155 -0.161 -0.055 -0.043 -0.048 -0.079 
##     70     77     84     91     98    105    112    119    126    133 
## -0.033 -0.053 -0.099 -0.126 -0.093 -0.079 -0.055 -0.018 -0.067 -0.034 
##    140 
## -0.069 
## 
## $`21`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.385  0.151  0.069  0.176 -0.200 -0.264 -0.190 -0.107 -0.132 
##     70     77     84     91     98    105    112    119    126    133 
## -0.136 -0.126 -0.072 -0.038 -0.026 -0.026  0.054  0.128  0.115 -0.002 
##    140 
## -0.048 
## 
## $`22`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.420  0.270  0.121  0.136 -0.259 -0.160 -0.110 -0.093 -0.178 
##     70     77     84     91     98    105    112    119    126    133 
## -0.110 -0.111 -0.157 -0.114 -0.104 -0.065 -0.017  0.060 -0.035 -0.050 
##    140 
## -0.032 
## 
## $`23`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.518  0.312  0.112  0.068 -0.221 -0.192 -0.184 -0.140 -0.152 
##     70     77     84     91     98    105    112    119    126    133 
## -0.133 -0.120 -0.136 -0.099 -0.081 -0.023  0.040  0.068 -0.019 -0.035 
##    140 
## -0.052 
## 
## $`24`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.302  0.161  0.009  0.134 -0.181 -0.129 -0.091 -0.091 -0.092 
##     70     77     84     91     98    105    112    119    126    133 
## -0.067 -0.043 -0.167 -0.124 -0.083  0.000 -0.107  0.002 -0.013 -0.016 
##    140 
##  0.043 
## 
## $`25`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.550  0.369  0.239  0.188 -0.138 -0.152 -0.148 -0.159 -0.199 
##     70     77     84     91     98    105    112    119    126    133 
## -0.178 -0.160 -0.170 -0.179 -0.136 -0.109 -0.063 -0.007 -0.041 -0.045 
##    140 
## -0.031 
## 
## $`26`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.331  0.258  0.024  0.161 -0.173 -0.117 -0.091 -0.056 -0.049 
##     70     77     84     91     98    105    112    119    126    133 
## -0.073 -0.005 -0.099 -0.061 -0.093 -0.016 -0.041  0.012 -0.033 -0.009 
##    140 
## -0.008 
## 
## $`27`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.311  0.223  0.086  0.154 -0.235 -0.126 -0.153 -0.118 -0.140 
##     70     77     84     91     98    105    112    119    126    133 
## -0.105 -0.068 -0.136 -0.094 -0.053  0.019 -0.033  0.042 -0.017 -0.035 
##    140 
##  0.026 
## 
## $`28`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.102  0.101  0.024  0.338 -0.147 -0.127 -0.109 -0.059  0.073 
##     70     77     84     91     98    105    112    119    126    133 
## -0.062 -0.098 -0.034  0.152 -0.042 -0.136  0.007  0.064  0.028 -0.143 
##    140 
## -0.094 
## 
## $`29`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.375  0.199  0.147  0.195 -0.234 -0.176 -0.087 -0.120 -0.172 
##     70     77     84     91     98    105    112    119    126    133 
## -0.108 -0.093 -0.125 -0.099 -0.117 -0.085 -0.026  0.007 -0.062 -0.084 
##    140 
## -0.030 
## 
## $`30`
## 
## Autocorrelations of series 'x', by lag
## 
##     0     7    14    21    28    35    42    49    56    63    70    77 
## 1.000 0.409 0.447 0.486 0.524 0.231 0.316 0.464 0.210 0.262 0.267 0.272 
##    84    91    98   105   112   119   126   133   140 
## 0.130 0.225 0.194 0.171 0.161 0.160 0.142 0.115 0.106 
## 
## $`31`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.267  0.179  0.144  0.366 -0.094 -0.051  0.025  0.007 -0.029 
##     70     77     84     91     98    105    112    119    126    133 
## -0.014  0.015 -0.026  0.012  0.015  0.002  0.030  0.051  0.054 -0.007 
##    140 
## -0.032 
## 
## $`32`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.366  0.254  0.129  0.266 -0.131 -0.090 -0.084 -0.028 -0.090 
##     70     77     84     91     98    105    112    119    126    133 
## -0.063 -0.086 -0.079 -0.089 -0.074 -0.061 -0.005  0.044  0.039  0.031 
##    140 
## -0.008 
## 
## $`33`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.520  0.217  0.318  0.773  0.585  0.180  0.157  0.493  0.619 
##     70     77     84     91     98    105    112    119    126    133 
##  0.197  0.030  0.183  0.500  0.192 -0.091 -0.081  0.261  0.202 -0.131 
##    140 
## -0.211 
## 
## $`34`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.303  0.214  0.123  0.288 -0.102 -0.056  0.003 -0.016 -0.048 
##     70     77     84     91     98    105    112    119    126    133 
## -0.063 -0.033 -0.045 -0.089 -0.060 -0.022 -0.009 -0.048 -0.018 -0.023 
##    140 
## -0.051 
## 
## $`35`
## 
## Autocorrelations of series 'x', by lag
## 
##     0     7    14    21    28    35    42    49    56    63    70    77 
## 1.000 0.471 0.371 0.307 0.419 0.084 0.080 0.097 0.103 0.075 0.135 0.196 
##    84    91    98   105   112   119   126   133   140 
## 0.146 0.129 0.156 0.196 0.179 0.163 0.163 0.155 0.158 
## 
## $`36`
## 
## Autocorrelations of series 'x', by lag
## 
##     0     7    14    21    28    35    42    49    56    63    70    77 
## 1.000 0.854 0.778 0.771 0.844 0.766 0.679 0.652 0.707 0.707 0.625 0.548 
##    84    91    98   105   112   119   126   133   140 
## 0.575 0.602 0.523 0.441 0.427 0.466 0.423 0.344 0.304 
## 
## $`37`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000 -0.011  0.186  0.155  0.218  0.057  0.109  0.121  0.037  0.070 
##     70     77     84     91     98    105    112    119    126    133 
##  0.042  0.185 -0.073  0.018 -0.012  0.024  0.001 -0.035 -0.124  0.004 
##    140 
## -0.072 
## 
## $`38`
## 
## Autocorrelations of series 'x', by lag
## 
##     0     7    14    21    28    35    42    49    56    63    70    77 
## 1.000 0.420 0.418 0.405 0.658 0.558 0.346 0.342 0.419 0.628 0.311 0.279 
##    84    91    98   105   112   119   126   133   140 
## 0.281 0.574 0.332 0.193 0.213 0.355 0.362 0.184 0.105 
## 
## $`39`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.452  0.339  0.253  0.362 -0.014 -0.019 -0.023 -0.044 -0.073 
##     70     77     84     91     98    105    112    119    126    133 
## -0.040 -0.030 -0.044 -0.021  0.022  0.038  0.084  0.140  0.125  0.065 
##    140 
##  0.039 
## 
## $`40`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.299  0.247  0.006  0.089 -0.221 -0.113 -0.095 -0.059 -0.031 
##     70     77     84     91     98    105    112    119    126    133 
## -0.048 -0.051 -0.138 -0.084 -0.124 -0.043 -0.055  0.022 -0.008 -0.003 
##    140 
##  0.048 
## 
## $`41`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.371  0.321  0.182  0.318 -0.082 -0.007 -0.016 -0.032 -0.069 
##     70     77     84     91     98    105    112    119    126    133 
## -0.014 -0.016 -0.062 -0.066  0.010  0.029  0.053  0.076  0.115  0.064 
##    140 
##  0.073 
## 
## $`42`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.134 -0.519 -0.170  0.720  0.444 -0.342 -0.393  0.338  0.687 
##     70     77     84     91     98    105    112    119    126    133 
## -0.093 -0.517 -0.016  0.690  0.168 -0.468 -0.261  0.476  0.373 -0.284 
##    140 
## -0.433 
## 
## $`43`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.372  0.201  0.317  0.649  0.379  0.175  0.191  0.307  0.445 
##     70     77     84     91     98    105    112    119    126    133 
##  0.155  0.055  0.039  0.329  0.191 -0.052 -0.054  0.184  0.209 -0.104 
##    140 
## -0.063 
## 
## $`44`
## 
## Autocorrelations of series 'x', by lag
## 
##     0     7    14    21    28    35    42    49    56    63    70    77 
## 1.000 0.106 0.301 0.275 0.398 0.210 0.214 0.233 0.242 0.101 0.129 0.221 
##    84    91    98   105   112   119   126   133   140 
## 0.169 0.143 0.088 0.138 0.190 0.114 0.019 0.133 0.163 
## 
## $`45`
## 
## Autocorrelations of series 'x', by lag
## 
##      0      7     14     21     28     35     42     49     56     63 
##  1.000  0.378  0.234  0.103  0.160 -0.191 -0.113 -0.087 -0.083 -0.120 
##     70     77     84     91     98    105    112    119    126    133 
## -0.083 -0.098 -0.122 -0.114 -0.121 -0.094 -0.050 -0.040 -0.078 -0.055 
##    140 
## -0.032
# Apply pacf() to every column of train_xts; the result is a list with one
# partial-autocorrelation object per column, printed below under the column's
# name. As with the ACF call, the printed series label 'x' follows the lambda's
# argument name.
lapply(train_xts, function(x) pacf(x))

## $`1`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.299  0.082  0.014  0.177 -0.338 -0.005  0.072 -0.133  0.118 -0.040 
##     77     84     91     98    105    112    119    126    133    140 
## -0.091  0.084 -0.056 -0.015  0.043 -0.014  0.002  0.007 -0.081  0.002 
## 
## $`2`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.393  0.123 -0.039  0.162 -0.326 -0.012  0.145 -0.142  0.045 -0.024 
##     77     84     91     98    105    112    119    126    133    140 
## -0.102  0.064 -0.101 -0.047  0.055  0.002 -0.012 -0.038 -0.102 -0.018 
## 
## $`3`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.522  0.143  0.192  0.033 -0.361  0.017  0.117 -0.015  0.149 -0.119 
##     77     84     91     98    105    112    119    126    133    140 
## -0.047 -0.011 -0.013 -0.048 -0.093  0.020  0.004 -0.035 -0.024 -0.106 
## 
## $`4`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.415  0.143  0.059  0.262 -0.270  0.018  0.113 -0.093  0.163 -0.050 
##     77     84     91     98    105    112    119    126    133    140 
## -0.040  0.055 -0.093  0.079  0.029 -0.046  0.085 -0.056 -0.010 -0.023 
## 
## $`5`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.367  0.165  0.098  0.177 -0.328 -0.139  0.001 -0.010  0.175 -0.032 
##     77     84     91     98    105    112    119    126    133    140 
## -0.028  0.011 -0.134 -0.013 -0.018  0.053  0.011 -0.052 -0.115 -0.064 
## 
## $`6`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.455  0.093 -0.025  0.116 -0.365 -0.031 -0.009 -0.171  0.046 -0.109 
##     77     84     91     98    105    112    119    126    133    140 
## -0.064 -0.001 -0.206 -0.034  0.003 -0.049 -0.007 -0.090 -0.120  0.010 
## 
## $`7`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.638  0.181 -0.028  0.200 -0.282 -0.120 -0.029 -0.181  0.124  0.009 
##     77     84     91     98    105    112    119    126    133    140 
##  0.010  0.030 -0.079 -0.011  0.046  0.012  0.010  0.054 -0.006  0.111 
## 
## $`8`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.367  0.134  0.005  0.132 -0.366  0.050  0.109 -0.029  0.109 -0.136 
##     77     84     91     98    105    112    119    126    133    140 
## -0.058  0.017 -0.068  0.056  0.008  0.044 -0.012 -0.136 -0.016 -0.094 
## 
## $`9`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.428  0.139  0.046  0.153 -0.369  0.051  0.065 -0.012  0.084 -0.132 
##     77     84     91     98    105    112    119    126    133    140 
##  0.008  0.018 -0.097  0.053 -0.008  0.059  0.006 -0.103 -0.018 -0.025 
## 
## $`10`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.456  0.129 -0.001  0.129 -0.355  0.032  0.067 -0.117  0.112 -0.099 
##     77     84     91     98    105    112    119    126    133    140 
## -0.044  0.018 -0.135  0.087 -0.007 -0.034  0.077 -0.048 -0.029 -0.108 
## 
## $`11`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.407  0.068 -0.040  0.103 -0.377  0.088  0.113 -0.147  0.093 -0.122 
##     77     84     91     98    105    112    119    126    133    140 
## -0.070  0.063 -0.154  0.054  0.006  0.008  0.097 -0.019 -0.091 -0.061 
## 
## $`12`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.364  0.127 -0.008  0.157 -0.381  0.015  0.029 -0.108  0.118 -0.081 
##     77     84     91     98    105    112    119    126    133    140 
## -0.061  0.102 -0.159  0.014  0.049  0.026  0.022 -0.120 -0.108 -0.080 
## 
## $`13`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.399  0.122 -0.061  0.174 -0.354  0.033  0.063 -0.136  0.097 -0.081 
##     77     84     91     98    105    112    119    126    133    140 
## -0.095 -0.007 -0.114  0.029 -0.005  0.053  0.046 -0.056 -0.026 -0.038 
## 
## $`14`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.264  0.206 -0.076  0.107 -0.320 -0.031  0.074 -0.069  0.027 -0.091 
##     77     84     91     98    105    112    119    126    133    140 
## -0.005 -0.129  0.017 -0.008  0.016  0.012 -0.025 -0.042 -0.055  0.048 
## 
## $`15`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.408  0.081 -0.010  0.119 -0.402 -0.005 -0.003 -0.077  0.073 -0.161 
##     77     84     91     98    105    112    119    126    133    140 
## -0.064 -0.048 -0.142  0.034 -0.026 -0.042 -0.021 -0.191 -0.021  0.004 
## 
## $`16`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.597  0.114 -0.058  0.162 -0.264 -0.008  0.000 -0.054  0.088 -0.160 
##     77     84     91     98    105    112    119    126    133    140 
## -0.061  0.007 -0.053  0.059 -0.013 -0.035 -0.114 -0.087  0.096  0.040 
## 
## $`17`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.148  0.305  0.093  0.040 -0.238  0.071  0.075  0.021 -0.044  0.039 
##     77     84     91     98    105    112    119    126    133    140 
##  0.148 -0.058 -0.073  0.024  0.220 -0.250 -0.026 -0.047 -0.045  0.227 
## 
## $`18`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.435  0.182 -0.049  0.108 -0.352  0.066  0.134 -0.081  0.051 -0.123 
##     77     84     91     98    105    112    119    126    133    140 
## -0.048 -0.097  0.041 -0.027  0.015  0.081  0.003 -0.066 -0.046  0.017 
## 
## $`19`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.353  0.063 -0.098  0.098 -0.291  0.003  0.016 -0.092  0.011 -0.077 
##     77     84     91     98    105    112    119    126    133    140 
##  0.013 -0.060 -0.069  0.032  0.053  0.022 -0.043 -0.162 -0.035  0.040 
## 
## $`20`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.396  0.145 -0.031  0.098 -0.309  0.075  0.053 -0.071  0.039 -0.073 
##     77     84     91     98    105    112    119    126    133    140 
## -0.031 -0.047 -0.094  0.011 -0.012  0.001  0.020 -0.132  0.023 -0.064 
## 
## $`21`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.385  0.004  0.012  0.170 -0.392 -0.093 -0.004 -0.079  0.076 -0.108 
##     77     84     91     98    105    112    119    126    133    140 
## -0.151 -0.015 -0.053 -0.002 -0.013  0.003  0.064 -0.008 -0.122 -0.095 
## 
## $`22`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.420  0.113 -0.033  0.086 -0.425  0.075  0.067 -0.082 -0.008 -0.157 
##     77     84     91     98    105    112    119    126    133    140 
## -0.061 -0.065  0.010 -0.082 -0.035  0.039 -0.012 -0.127 -0.101 -0.031 
## 
## $`23`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.518  0.061 -0.099  0.041 -0.347  0.061 -0.005 -0.061  0.005 -0.152 
##     77     84     91     98    105    112    119    126    133    140 
## -0.030 -0.093 -0.008 -0.022 -0.020  0.065 -0.063 -0.144 -0.037 -0.063 
## 
## $`24`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.302  0.077 -0.066  0.150 -0.283 -0.033  0.031 -0.126  0.051 -0.062 
##     77     84     91     98    105    112    119    126    133    140 
## -0.056 -0.142 -0.077 -0.009  0.011 -0.109  0.015 -0.072 -0.091  0.116 
## 
## $`25`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.550  0.096  0.003  0.048 -0.394  0.047  0.019 -0.061  0.025 -0.149 
##     77     84     91     98    105    112    119    126    133    140 
## -0.045 -0.051 -0.091  0.030 -0.061  0.014  0.034 -0.188 -0.016 -0.046 
## 
## $`26`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.331  0.167 -0.118  0.165 -0.289 -0.054  0.097 -0.104  0.088 -0.093 
##     77     84     91     98    105    112    119    126    133    140 
## -0.016 -0.068 -0.062  0.010 -0.014  0.011  0.003 -0.078 -0.009  0.000 
## 
## $`27`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.311  0.139 -0.019  0.118 -0.359 -0.006 -0.031 -0.068  0.053 -0.139 
##     77     84     91     98    105    112    119    126    133    140 
##  0.004 -0.147 -0.057  0.035 -0.012 -0.011 -0.022 -0.111 -0.083  0.079 
## 
## $`28`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.102  0.091  0.006  0.332 -0.241 -0.160 -0.054 -0.160  0.308 -0.032 
##     77     84     91     98    105    112    119    126    133    140 
## -0.150  0.048 -0.046  0.020 -0.058 -0.004  0.033  0.029 -0.105 -0.141 
## 
## $`29`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.375  0.069  0.061  0.132 -0.428  0.011  0.042 -0.108  0.082 -0.142 
##     77     84     91     98    105    112    119    126    133    140 
## -0.097 -0.004 -0.082 -0.076 -0.010 -0.002 -0.014 -0.105 -0.151 -0.024 
## 
## $`30`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.409  0.336  0.308  0.305 -0.219 -0.061  0.274 -0.130  0.082 -0.045 
##     77     84     91     98    105    112    119    126    133    140 
## -0.057  0.055  0.005 -0.033  0.099 -0.014 -0.028  0.002 -0.024 -0.038 
## 
## $`31`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.267  0.116  0.077  0.324 -0.324 -0.046  0.068 -0.137  0.206 -0.048 
##     77     84     91     98    105    112    119    126    133    140 
## -0.066  0.070 -0.086  0.070  0.015  0.003  0.073 -0.018 -0.037 -0.071 
## 
## $`32`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.366  0.139 -0.003  0.224 -0.363 -0.013  0.034 -0.061  0.111 -0.092 
##     77     84     91     98    105    112    119    126    133    140 
## -0.088 -0.011 -0.068  0.020  0.011  0.020  0.068 -0.032 -0.013 -0.073 
## 
## $`33`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.520 -0.073  0.324  0.731 -0.062 -0.292  0.066 -0.137  0.273 -0.177 
##     77     84     91     98    105    112    119    126    133    140 
## -0.116 -0.175  0.038 -0.147 -0.069 -0.063 -0.073  0.069  0.015  0.063 
## 
## $`34`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.303  0.134  0.029  0.247 -0.303 -0.031  0.085 -0.112  0.122 -0.088 
##     77     84     91     98    105    112    119    126    133    140 
## -0.066  0.063 -0.150  0.062  0.031 -0.057  0.050 -0.065 -0.035 -0.005 
## 
## $`35`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.471  0.191  0.101  0.275 -0.321 -0.006  0.069 -0.047  0.197  0.060 
##     77     84     91     98    105    112    119    126    133    140 
##  0.075  0.000 -0.074  0.054  0.073  0.100  0.054 -0.013 -0.020  0.015 
## 
## $`36`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.854  0.181  0.275  0.483 -0.258 -0.158  0.028  0.111  0.190 -0.118 
##     77     84     91     98    105    112    119    126    133    140 
## -0.241  0.045  0.046 -0.152  0.054 -0.092 -0.003  0.018 -0.078  0.027 
## 
## $`37`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
## -0.011  0.186  0.165  0.204  0.023  0.020  0.055 -0.029  0.009 -0.008 
##     77     84     91     98    105    112    119    126    133    140 
##  0.152 -0.086 -0.078 -0.063 -0.017  0.041 -0.033 -0.153  0.008 -0.034 
## 
## $`38`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.420  0.293  0.209  0.525  0.334 -0.091 -0.117 -0.148  0.286 -0.047 
##     77     84     91     98    105    112    119    126    133    140 
## -0.078 -0.109  0.210 -0.062 -0.144 -0.048 -0.025 -0.059  0.082 -0.073 
## 
## $`39`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.452  0.169  0.064  0.249 -0.375 -0.019  0.029 -0.126  0.197 -0.049 
##     77     84     91     98    105    112    119    126    133    140 
## -0.044  0.053 -0.100  0.105  0.046  0.050  0.153 -0.092 -0.059 -0.044 
## 
## $`40`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.299  0.173 -0.122  0.085 -0.274 -0.021  0.064 -0.073  0.070 -0.102 
##     77     84     91     98    105    112    119    126    133    140 
## -0.071 -0.097 -0.045 -0.035 -0.002 -0.023 -0.015 -0.038 -0.064  0.068 
## 
## $`41`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.371  0.213  0.011  0.236 -0.349  0.001  0.073 -0.129  0.152 -0.046 
##     77     84     91     98    105    112    119    126    133    140 
## -0.050  0.016 -0.119  0.126  0.055  0.029  0.098 -0.047 -0.021  0.034 
## 
## $`42`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.134 -0.547  0.015  0.681  0.291  0.070 -0.045 -0.192  0.356 -0.108 
##     77     84     91     98    105    112    119    126    133    140 
## -0.083 -0.058  0.033 -0.101  0.003  0.042 -0.015 -0.128  0.029 -0.075 
## 
## $`43`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.372  0.073  0.257  0.566  0.072 -0.057 -0.116 -0.220  0.307 -0.063 
##     77     84     91     98    105    112    119    126    133    140 
## -0.081 -0.238 -0.020  0.170 -0.062  0.055 -0.008 -0.021 -0.077  0.061 
## 
## $`44`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.106  0.293  0.245  0.340  0.109 -0.001  0.011  0.029 -0.094 -0.081 
##     77     84     91     98    105    112    119    126    133    140 
##  0.081  0.088  0.090 -0.025 -0.050  0.073  0.038 -0.129 -0.040  0.093 
## 
## $`45`
## 
## Partial autocorrelations of series 'x', by lag
## 
##      7     14     21     28     35     42     49     56     63     70 
##  0.378  0.106 -0.019  0.125 -0.346  0.034  0.030 -0.087  0.049 -0.108 
##     77     84     91     98    105    112    119    126    133    140 
## -0.072 -0.038 -0.072 -0.051 -0.022 -0.006 -0.047 -0.092 -0.047 -0.034
# Fit one auto.arima model per column (store series) of train_xts.
# lambda = 0 requests a Box-Cox transform with lambda 0 (natural log).
fit_log_arima <- function(x) {
  auto.arima(x, lambda = 0)
}
train_ts_fit <- lapply(train_xts, fit_log_arima)


# Validate on the test set by MAE, MAPE and RMSE.
# Each sum_* accumulates one per-store metric; the reporting code that
# follows divides the sums by the number of stores to get the averages.
fcast <- list()
sum_MAE <- 0
sum_MAPE <- 0
sum_RMSE <- 0

# Forecast exactly as many steps as the test set has rows. With a hard-coded
# horizon, a test set of a different length would be silently recycled in
# the error arithmetic below.
horizon <- nrow(test_xts)

for (i in colnames(train_xts)) {
  # NOTE(review): this refits auto.arima() without `lambda`, so the metrics
  # below do not describe the lambda = 0 models stored in train_ts_fit.
  # Confirm which specification is meant to be evaluated.
  fcast[[i]] <- forecast(auto.arima(train_xts[, i]), h = horizon)$mean
  fcast.num <- as.numeric(fcast[[i]])
  test.num <- as.numeric(test_xts[, i])

  abs_err <- abs(test.num - fcast.num)
  sum_MAE <- sum_MAE + mean(abs_err)
  # MAPE is undefined when an actual value is 0 (division by zero).
  sum_MAPE <- sum_MAPE + 100 * mean(abs_err / abs(test.num))
  sum_RMSE <- sum_RMSE + sqrt(mean((test.num - fcast.num)^2))
}

# Divide by the number of store series (columns of train_xts, the same set
# the validation loop iterates over) to get average model statistics.
print(paste("arima model MAE is", sum_MAE / ncol(train_xts)))
## [1] "arima model MAE is 64933.6014515565"
# Average the summed per-store MAPE over the number of store series.
print(paste("arima model MAPE is", sum_MAPE / ncol(train_xts)))
## [1] "arima model MAPE is 6.17079873690773"
# Average the summed per-store RMSE over the number of store series.
print(paste("arima model RMSE is", sum_RMSE / ncol(train_xts)))
## [1] "arima model RMSE is 79707.2952189233"

Random Forest model using “ranger”

# Build the random forest training and test sets: one row per store and week
# holding the store's total weekly sales, split at 2012-02-03.
split_date <- as.Date("2012-02-03")

# summarize() only drops the last grouping level, which would leave every
# row in its own (Date, Store, IsHoliday, Week) group and silently turn later
# summaries into per-row results. ungroup() returns a plain tibble.
train_rf <- data_clean %>%
  filter(Date < split_date) %>%
  group_by(Date, Store, IsHoliday, Week, Type) %>%
  summarize(Store_Weekly_Sales = sum(Weekly_Sales)) %>%
  ungroup()

test_rf <- data_clean %>%
  filter(Date >= split_date) %>%
  group_by(Date, Store, IsHoliday, Week, Type) %>%
  summarize(Store_Weekly_Sales = sum(Weekly_Sales)) %>%
  ungroup()

train_rf
## # A tibble: 4,680 x 6
## # Groups:   Date, Store, IsHoliday, Week [4,680]
##    Date       Store IsHoliday  Week Type  Store_Weekly_Sales
##    <date>     <fct> <lgl>     <dbl> <fct>              <dbl>
##  1 2010-02-05 1     FALSE         5 A               1643691.
##  2 2010-02-05 2     FALSE         5 A               2136989.
##  3 2010-02-05 3     FALSE         5 B                461622.
##  4 2010-02-05 4     FALSE         5 A               2135144.
##  5 2010-02-05 5     FALSE         5 B                317173.
##  6 2010-02-05 6     FALSE         5 A               1652635.
##  7 2010-02-05 7     FALSE         5 B                496725.
##  8 2010-02-05 8     FALSE         5 A               1004137.
##  9 2010-02-05 9     FALSE         5 B                549506.
## 10 2010-02-05 10    FALSE         5 B               2193049.
## # … with 4,670 more rows
# Print the test split to check its columns and first rows.
test_rf
## # A tibble: 1,755 x 6
## # Groups:   Date, Store, IsHoliday, Week [1,755]
##    Date       Store IsHoliday  Week Type  Store_Weekly_Sales
##    <date>     <fct> <lgl>     <dbl> <fct>              <dbl>
##  1 2012-02-03 1     FALSE         5 A               1636340.
##  2 2012-02-03 2     FALSE         5 A               1935300.
##  3 2012-02-03 3     FALSE         5 B                424961.
##  4 2012-02-03 4     FALSE         5 A               2173374.
##  5 2012-02-03 5     FALSE         5 B                333948 
##  6 2012-02-03 6     FALSE         5 A               1496306.
##  7 2012-02-03 7     FALSE         5 B                580453.
##  8 2012-02-03 8     FALSE         5 A                927611.
##  9 2012-02-03 9     FALSE         5 B                549968.
## 10 2012-02-03 10    FALSE         5 B               1867403.
## # … with 1,745 more rows
# set.seed() returns NULL invisibly, so keep the seed value in its own
# variable; otherwise `seed = seed` below hands ranger a NULL seed.
seed <- 10
set.seed(seed)

# Refer to the outcome by column name so the formula is resolved inside the
# `data` argument rather than against the global train_rf object.
outcome <- "Store_Weekly_Sales"
vars <- c("Store", "Week", "Type", "IsHoliday")
fmla <- paste(outcome, "~", paste(vars, collapse = "+"))

# ranger accepts the formula as a character string.
fit_rf <- ranger(fmla, train_rf, num.trees = 500,
                 respect.unordered.factors = "order", seed = seed)

# Score the test rows with the fitted forest and store the predictions
# next to the actual sales.
rf_test_pred <- predict(fit_rf, data = test_rf)
test_rf$pred <- rf_test_pred$predictions

# Calculate prediction RMSE as the mean of the per-store RMSEs.
# test_rf is still grouped one row per group, so summarizing without
# regrouping yields |residual| per row and the final mean() is an MAE, not
# an RMSE. Grouping by Store matches how the XGBoost metric is computed.
test_rf %>%
  mutate(residual = Store_Weekly_Sales - pred) %>%
  group_by(Store) %>%
  summarize(RMSE = sqrt(mean(residual^2))) %>%
  magrittr::use_series(RMSE) %>%
  mean()
## [1] 68906.49
# Predicted vs. actual store sales on the test set, with one coloured line
# per store; geom_abline() adds its default reference line.
rf_plot <- ggplot(data = test_rf,
                  mapping = aes(x = pred, y = Store_Weekly_Sales))
rf_plot +
  geom_point() +
  geom_line(mapping = aes(color = factor(Store))) +
  geom_abline()

XGBoost Tree Model

# Data prep using "vtreat"

# Design an outcome-free treatment plan for the predictor columns in `vars`.
treatplan <- designTreatmentsZ(train_rf, vars, verbose = FALSE)

# Look up the derived variable names, keeping only the "clean" and "lev"
# entries of the plan's score frame.
plan_scores <- treatplan$scoreFrame
newvars <- plan_scores$varName[plan_scores$code %in% c("clean", "lev")]

# Apply the same plan and variable restriction to both splits.
train_rf_treat <- prepare(treatplan, train_rf, varRestriction = newvars)
test_rf_treat <- prepare(treatplan, test_rf, varRestriction = newvars)
str(train_rf_treat)
## 'data.frame':    4680 obs. of  50 variables:
##  $ Week          : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ IsHoliday     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_1 : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_10: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Store_lev_x_11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_12: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_14: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_15: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_16: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_17: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_18: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_19: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_2 : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_20: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_21: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_22: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_23: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_24: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_25: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_26: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_27: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_28: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_29: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_3 : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_30: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_31: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_32: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_33: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_34: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_35: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_36: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_37: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_38: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_39: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_4 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Store_lev_x_40: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_41: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_42: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_43: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_44: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_45: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_5 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Store_lev_x_6 : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Store_lev_x_7 : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Store_lev_x_8 : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Store_lev_x_9 : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Type_lev_x_A  : num  1 1 0 1 0 1 0 1 0 0 ...
##  $ Type_lev_x_B  : num  0 0 1 0 1 0 1 0 1 1 ...
##  $ Type_lev_x_C  : num  0 0 0 0 0 0 0 0 0 0 ...
# Inspect the treated test frame, prepared with the same treatment plan and
# variable restriction as the training frame.
str(test_rf_treat)
## 'data.frame':    1755 obs. of  50 variables:
##  $ Week          : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ IsHoliday     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_1 : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_10: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Store_lev_x_11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_12: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_14: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_15: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_16: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_17: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_18: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_19: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_2 : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_20: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_21: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_22: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_23: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_24: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_25: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_26: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_27: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_28: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_29: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_3 : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_30: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_31: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_32: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_33: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_34: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_35: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_36: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_37: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_38: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_39: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_4 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Store_lev_x_40: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_41: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_42: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_43: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_44: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_45: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Store_lev_x_5 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Store_lev_x_6 : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Store_lev_x_7 : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Store_lev_x_8 : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Store_lev_x_9 : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Type_lev_x_A  : num  1 1 0 1 0 1 0 1 0 0 ...
##  $ Type_lev_x_B  : num  0 0 1 0 1 0 1 0 1 1 ...
##  $ Type_lev_x_C  : num  0 0 0 0 0 0 0 0 0 0 ...
# xgboost cross validation on the training set: 5 folds, at most 100 rounds,
# stopping once the test RMSE has not improved for 10 rounds.
# "reg:linear" is deprecated in favor of "reg:squarederror".
cv <- xgb.cv(data = as.matrix(train_rf_treat),
             label = train_rf$Store_Weekly_Sales,
             nrounds = 100, nfold = 5, max_depth = 6, eta = 0.3,
             objective = "reg:squarederror", early_stopping_rounds = 10)
## [04:13:36] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:36] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:862282.400000+1628.260279    test-rmse:862886.512500+8568.557350 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [2]  train-rmse:632992.587500+918.194840 test-rmse:633014.975000+6262.622753 
## [3]  train-rmse:475023.656250+1634.537846    test-rmse:475896.987500+4848.590787 
## [4]  train-rmse:366171.556250+1973.696671    test-rmse:367492.712500+3941.187009 
## [5]  train-rmse:291985.431250+2006.600549    test-rmse:296671.537500+3891.661203 
## [6]  train-rmse:242186.887500+2610.504242    test-rmse:247743.387500+6401.768745 
## [7]  train-rmse:208834.465625+3724.237255    test-rmse:214529.840625+6333.118730 
## [8]  train-rmse:185239.771875+2840.337732    test-rmse:192620.159375+5671.530014 
## [9]  train-rmse:169638.159375+3006.191626    test-rmse:178997.725000+6223.592634 
## [10] train-rmse:157930.203125+2826.673997    test-rmse:168065.103125+6916.199015 
## [11] train-rmse:149411.940625+1979.544857    test-rmse:159659.150000+6411.843533 
## [12] train-rmse:141879.209375+2311.359854    test-rmse:152916.521875+6711.671487 
## [13] train-rmse:134846.609375+1691.227277    test-rmse:146419.187500+6322.391126 
## [14] train-rmse:129363.843750+1449.398559    test-rmse:140907.059375+5654.048145 
## [15] train-rmse:123906.337500+934.256830 test-rmse:136369.140625+5528.793028 
## [16] train-rmse:119780.826563+645.267979 test-rmse:132698.037500+5612.884757 
## [17] train-rmse:116054.971875+830.468588 test-rmse:128652.629687+5836.304871 
## [18] train-rmse:112775.440625+662.476747 test-rmse:125427.339063+5838.021699 
## [19] train-rmse:109878.657812+996.205461 test-rmse:122789.784375+5738.262154 
## [20] train-rmse:107163.754688+848.287801 test-rmse:120468.225000+5780.027733 
## [21] train-rmse:104150.137500+875.559993 test-rmse:118221.356250+6097.211212 
## [22] train-rmse:101359.656250+787.792734 test-rmse:115823.964063+5341.858740 
## [23] train-rmse:99104.551563+912.913593  test-rmse:113674.670313+5226.754461 
## [24] train-rmse:96867.920313+773.240069  test-rmse:112006.064063+5528.126017 
## [25] train-rmse:94962.770313+836.687034  test-rmse:110209.670312+5482.852506 
## [26] train-rmse:93242.723437+994.484044  test-rmse:108607.159375+5859.720974 
## [27] train-rmse:91380.498437+1036.864386 test-rmse:107303.053125+5895.066634 
## [28] train-rmse:89772.745312+725.712309  test-rmse:106014.596875+5975.810766 
## [29] train-rmse:88329.606250+720.044322  test-rmse:104906.617187+6094.042997 
## [30] train-rmse:87115.409375+1016.453762 test-rmse:103886.865625+5994.954717 
## [31] train-rmse:85713.960938+934.971476  test-rmse:102918.553125+6304.505331 
## [32] train-rmse:84374.720312+1021.587053 test-rmse:101995.021875+6413.165394 
## [33] train-rmse:83339.801563+1116.223802 test-rmse:101350.792187+6304.864848 
## [34] train-rmse:82492.592188+1129.354596 test-rmse:100690.175000+6378.735428 
## [35] train-rmse:80791.412500+915.196162  test-rmse:99462.182813+6026.812450 
## [36] train-rmse:79991.050000+998.655983  test-rmse:98938.531250+6088.102305 
## [37] train-rmse:78671.965625+787.658270  test-rmse:97947.073438+6426.951933 
## [38] train-rmse:77597.051562+499.147713  test-rmse:97447.595312+6503.626432 
## [39] train-rmse:76868.248438+499.911856  test-rmse:97011.096875+6401.480049 
## [40] train-rmse:76109.035937+247.855549  test-rmse:96359.103125+6744.507526 
## [41] train-rmse:74961.470312+610.293253  test-rmse:95597.348438+6526.024821 
## [42] train-rmse:74220.445313+625.010027  test-rmse:95290.378125+6628.321287 
## [43] train-rmse:73527.096875+475.618166  test-rmse:94544.235937+6772.329539 
## [44] train-rmse:72963.145313+681.199956  test-rmse:94134.421875+6800.539546 
## [45] train-rmse:72208.664062+438.033154  test-rmse:93705.509375+7064.347556 
## [46] train-rmse:71733.825000+492.477312  test-rmse:93357.445312+7073.152285 
## [47] train-rmse:71016.917188+493.306064  test-rmse:92856.670313+7078.902913 
## [48] train-rmse:70403.109375+450.133871  test-rmse:92495.037500+7348.722239 
## [49] train-rmse:69985.170312+338.479199  test-rmse:92229.340625+7317.593437 
## [50] train-rmse:69598.381250+492.044858  test-rmse:91947.001563+7434.438578 
## [51] train-rmse:69101.707812+555.659864  test-rmse:91609.800000+7589.462651 
## [52] train-rmse:68611.431250+777.814649  test-rmse:91399.239063+7601.262501 
## [53] train-rmse:68094.806250+696.462149  test-rmse:91284.023438+7869.567901 
## [54] train-rmse:67616.496875+701.551712  test-rmse:91076.304687+7922.174673 
## [55] train-rmse:66831.207813+663.859158  test-rmse:90502.215625+7639.559211 
## [56] train-rmse:66001.510938+416.502047  test-rmse:90157.378125+7476.217296 
## [57] train-rmse:65639.016406+390.494078  test-rmse:90075.981250+7521.955946 
## [58] train-rmse:65239.530469+525.946369  test-rmse:89927.685938+7505.828376 
## [59] train-rmse:64905.942187+490.294556  test-rmse:89753.918750+7530.250128 
## [60] train-rmse:64474.571875+577.513364  test-rmse:89771.654687+7562.762379 
## [61] train-rmse:64041.322656+556.548944  test-rmse:89654.629688+7464.357590 
## [62] train-rmse:63484.001562+783.000719  test-rmse:89407.540625+7433.533112 
## [63] train-rmse:63083.953906+799.119386  test-rmse:89437.628125+7557.980095 
## [64] train-rmse:62659.934375+720.194651  test-rmse:89347.151563+7512.773605 
## [65] train-rmse:62279.526562+592.661522  test-rmse:89231.550000+7556.660948 
## [66] train-rmse:61895.600781+542.158787  test-rmse:89002.201562+7543.509163 
## [67] train-rmse:61664.750781+565.681365  test-rmse:89049.200000+7750.741594 
## [68] train-rmse:61303.084375+727.927708  test-rmse:88899.898438+7856.735575 
## [69] train-rmse:60860.826562+751.551689  test-rmse:88738.845313+7865.001934 
## [70] train-rmse:60660.183594+750.422254  test-rmse:88615.209375+7892.156439 
## [71] train-rmse:60385.171094+672.984342  test-rmse:88448.976563+7841.022052 
## [72] train-rmse:60175.201563+722.654870  test-rmse:88480.340625+8000.503806 
## [73] train-rmse:59772.202344+706.413314  test-rmse:88301.101562+8068.716341 
## [74] train-rmse:59336.779687+802.249481  test-rmse:88203.181250+8082.382744 
## [75] train-rmse:58985.623437+743.829010  test-rmse:88099.779688+8125.924915 
## [76] train-rmse:58716.616406+829.660181  test-rmse:87996.409375+8017.077586 
## [77] train-rmse:58457.553906+915.102583  test-rmse:88022.450000+8109.123650 
## [78] train-rmse:58312.691406+896.885692  test-rmse:87945.446875+8075.271748 
## [79] train-rmse:58054.892969+900.874118  test-rmse:87985.978125+8041.083585 
## [80] train-rmse:57839.200000+873.792958  test-rmse:88031.834375+8228.601237 
## [81] train-rmse:57603.865625+869.487147  test-rmse:87996.456250+8272.252042 
## [82] train-rmse:57361.356250+908.993452  test-rmse:87910.346875+8193.713051 
## [83] train-rmse:57140.927344+906.441383  test-rmse:87855.734375+8188.471435 
## [84] train-rmse:56886.545312+814.019150  test-rmse:87838.896875+8262.783076 
## [85] train-rmse:56634.233594+769.985160  test-rmse:87920.740625+8174.171314 
## [86] train-rmse:56402.882813+851.411260  test-rmse:87849.432813+8172.347963 
## [87] train-rmse:56252.852344+822.823719  test-rmse:87863.317187+8183.240970 
## [88] train-rmse:56144.444531+813.858644  test-rmse:87816.032813+8171.278465 
## [89] train-rmse:55891.027344+868.410532  test-rmse:87819.957813+8213.104948 
## [90] train-rmse:55724.408594+849.963138  test-rmse:87841.645313+8256.783414 
## [91] train-rmse:55516.001562+851.560080  test-rmse:87824.965625+8286.632695 
## [92] train-rmse:55325.757031+809.629745  test-rmse:87862.545312+8246.908108 
## [93] train-rmse:55130.828906+751.593831  test-rmse:87844.503125+8187.714687 
## [94] train-rmse:54942.117969+825.741431  test-rmse:87921.334375+8210.278438 
## [95] train-rmse:54768.475781+851.472640  test-rmse:87934.998438+8236.115728 
## [96] train-rmse:54542.211719+863.584589  test-rmse:87980.435937+8254.714900 
## [97] train-rmse:54434.634375+903.299824  test-rmse:87989.790625+8326.303684 
## [98] train-rmse:54271.241406+866.737832  test-rmse:87955.439063+8301.000391 
## Stopping. Best iteration:
## [88] train-rmse:56144.444531+813.858644  test-rmse:87816.032813+8171.278465
# Evaluation: find the rounds with the lowest mean train and test RMSE in
# the cross-validation log.
elog <- summarize(
  cv$evaluation_log,
  ntrees.train = which.min(train_rmse_mean),
  ntrees.test = which.min(test_rmse_mean)
)
elog
##   ntrees.train ntrees.test
## 1           98          88
# Take the round count with the lowest cross-validated test RMSE from elog
# instead of hard-coding it: the CV folds are drawn at random, so the best
# round can change between runs.
nrounds <- elog$ntrees.test

# Fit the xgboost model on the full training set with the CV settings.
# "reg:linear" is deprecated in favor of "reg:squarederror".
fit_xgb <- xgboost(data = as.matrix(train_rf_treat),
                   label = train_rf$Store_Weekly_Sales,
                   nrounds = nrounds, max_depth = 6, eta = 0.3,
                   objective = "reg:squarederror")
## [04:13:39] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:861916.750000 
## [2]  train-rmse:632046.375000 
## [3]  train-rmse:473051.156250 
## [4]  train-rmse:363791.750000 
## [5]  train-rmse:289901.718750 
## [6]  train-rmse:239562.546875 
## [7]  train-rmse:203265.859375 
## [8]  train-rmse:181006.921875 
## [9]  train-rmse:165810.312500 
## [10] train-rmse:154943.578125 
## [11] train-rmse:147005.062500 
## [12] train-rmse:139474.859375 
## [13] train-rmse:134075.046875 
## [14] train-rmse:128767.507812 
## [15] train-rmse:124468.382812 
## [16] train-rmse:120405.921875 
## [17] train-rmse:115109.187500 
## [18] train-rmse:112099.578125 
## [19] train-rmse:108541.164062 
## [20] train-rmse:106140.250000 
## [21] train-rmse:103635.875000 
## [22] train-rmse:100809.062500 
## [23] train-rmse:98928.773438 
## [24] train-rmse:97486.078125 
## [25] train-rmse:96023.820312 
## [26] train-rmse:93736.617188 
## [27] train-rmse:92661.703125 
## [28] train-rmse:90555.023438 
## [29] train-rmse:88532.289062 
## [30] train-rmse:85469.906250 
## [31] train-rmse:84673.898438 
## [32] train-rmse:83004.625000 
## [33] train-rmse:81786.062500 
## [34] train-rmse:80850.554688 
## [35] train-rmse:79487.125000 
## [36] train-rmse:78600.679688 
## [37] train-rmse:78028.843750 
## [38] train-rmse:76704.085938 
## [39] train-rmse:76265.445312 
## [40] train-rmse:75721.867188 
## [41] train-rmse:75021.039062 
## [42] train-rmse:74170.117188 
## [43] train-rmse:73394.734375 
## [44] train-rmse:71236.289062 
## [45] train-rmse:70933.601562 
## [46] train-rmse:70584.343750 
## [47] train-rmse:69906.820312 
## [48] train-rmse:69293.210938 
## [49] train-rmse:68653.718750 
## [50] train-rmse:68219.328125 
## [51] train-rmse:67500.781250 
## [52] train-rmse:66990.390625 
## [53] train-rmse:66561.617188 
## [54] train-rmse:66274.070312 
## [55] train-rmse:65882.273438 
## [56] train-rmse:65703.921875 
## [57] train-rmse:65574.531250 
## [58] train-rmse:64736.808594 
## [59] train-rmse:64405.402344 
## [60] train-rmse:64006.894531 
## [61] train-rmse:63691.015625 
## [62] train-rmse:63551.902344 
## [63] train-rmse:63418.898438 
## [64] train-rmse:63176.003906 
## [65] train-rmse:62886.625000 
## [66] train-rmse:62282.394531 
## [67] train-rmse:62118.941406 
## [68] train-rmse:61525.636719 
## [69] train-rmse:61219.394531 
## [70] train-rmse:60788.832031 
## [71] train-rmse:60684.953125 
## [72] train-rmse:60328.257812 
## [73] train-rmse:59824.453125 
## [74] train-rmse:59625.078125 
## [75] train-rmse:59376.449219 
## [76] train-rmse:59187.148438 
## [77] train-rmse:59049.410156 
## [78] train-rmse:58781.226562 
## [79] train-rmse:58602.574219 
## [80] train-rmse:58440.226562 
## [81] train-rmse:58253.304688 
## [82] train-rmse:58061.941406 
## [83] train-rmse:57922.503906 
## [84] train-rmse:57780.234375 
## [85] train-rmse:57683.617188 
## [86] train-rmse:57511.320312 
## [87] train-rmse:57419.421875 
## [88] train-rmse:57107.171875
# Predict the test set from the treated test matrix and store the result
# next to the actual sales, then inspect the structure.
xgb_test_matrix <- as.matrix(test_rf_treat)
test_rf$xgbPred <- predict(fit_xgb, newdata = xgb_test_matrix)
str(test_rf)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  1755 obs. of  8 variables:
##  $ Date              : Date, format: "2012-02-03" "2012-02-03" ...
##  $ Store             : Factor w/ 45 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ IsHoliday         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Week              : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ Type              : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
##  $ Store_Weekly_Sales: num  1636340 1935300 424961 2173374 333948 ...
##  $ pred              : num  1559771 1932379 423340 2042503 342949 ...
##  $ xgbPred           : num  1607558 2015386 442634 2104802 333639 ...
##  - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame':   1755 obs. of  5 variables:
##   ..$ Date     : Date, format: "2012-02-03" ...
##   ..$ Store    : Factor w/ 45 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ IsHoliday: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..$ Week     : num  5 5 5 5 5 5 5 5 5 5 ...
##   ..$ .rows    :List of 1755
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..$ : int 59
##   .. ..$ : int 60
##   .. ..$ : int 61
##   .. ..$ : int 62
##   .. ..$ : int 63
##   .. ..$ : int 64
##   .. ..$ : int 65
##   .. ..$ : int 66
##   .. ..$ : int 67
##   .. ..$ : int 68
##   .. ..$ : int 69
##   .. ..$ : int 70
##   .. ..$ : int 71
##   .. ..$ : int 72
##   .. ..$ : int 73
##   .. ..$ : int 74
##   .. ..$ : int 75
##   .. ..$ : int 76
##   .. ..$ : int 77
##   .. ..$ : int 78
##   .. ..$ : int 79
##   .. ..$ : int 80
##   .. ..$ : int 81
##   .. ..$ : int 82
##   .. ..$ : int 83
##   .. ..$ : int 84
##   .. ..$ : int 85
##   .. ..$ : int 86
##   .. ..$ : int 87
##   .. ..$ : int 88
##   .. ..$ : int 89
##   .. ..$ : int 90
##   .. ..$ : int 91
##   .. ..$ : int 92
##   .. ..$ : int 93
##   .. ..$ : int 94
##   .. ..$ : int 95
##   .. ..$ : int 96
##   .. ..$ : int 97
##   .. ..$ : int 98
##   .. ..$ : int 99
##   .. .. [list output truncated]
##   ..- attr(*, ".drop")= logi TRUE
# XGBoost predictions vs. actual store sales on the test set, with one
# coloured line per store; geom_abline() adds its default reference line.
xgb_plot <- ggplot(data = test_rf,
                   mapping = aes(x = xgbPred, y = Store_Weekly_Sales))
xgb_plot +
  geom_point() +
  geom_line(mapping = aes(color = factor(Store))) +
  geom_abline()

# Model RMSE: first the RMSE of each store's residuals, then the mean of
# those per-store values.
xgb_store_rmse <- test_rf %>%
  mutate(xgbResidual = Store_Weekly_Sales - xgbPred) %>%
  group_by(Store) %>%
  summarize(xgbRMSE_store = (mean(xgbResidual^2))^0.5)

summarize(xgb_store_rmse, xgbRMSE = mean(xgbRMSE_store))
## # A tibble: 1 x 1
##   xgbRMSE
##     <dbl>
## 1  79746.

Build SVM model

#prepare data